package us.codecraft.webmagic.lianjia.processor;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;

/**
 * Page model for second-hand housing ("ershoufang") pages on sh.lianjia.com.
 *
 * <p>The class is handed to {@link OOSpider} in {@link #main(String[])}; the {@link TargetUrl}
 * pattern selects which pages are mapped onto it, and {@link ExtractBy} supplies the XPath
 * used to populate {@link #name}.
 */
@TargetUrl("https://sh.lianjia.com/ershoufang/\\w+.html")
public class LianjiaRepo implements AfterExtractor {
    /** Seed URL the crawl starts from. */
    private static final String START_URL = "https://sh.lianjia.com/ershoufang/beicai/";

    /** Scheduler cache directory used when none is supplied on the command line. */
    private static final String DEFAULT_CACHE_DIR = "/Users/jaylee/Downloads";

    /** Text of the {@code sub} element nested inside the page's {@code title} element. */
    @ExtractBy(value = "//div[@class='title']/div[@class='sub']/text()")
    private String name;

    /**
     * Post-extraction hook: prints the extracted {@link #name} to standard output.
     *
     * @param page the page the fields were extracted from (not used here)
     */
    public void afterProcess(Page page) {
        System.out.println(name);
    }

    /**
     * Starts a single-threaded crawl from {@link #START_URL}.
     *
     * @param args optional; {@code args[0]}, when present, is the directory passed to
     *             {@link FileCacheQueueScheduler}. Falls back to {@link #DEFAULT_CACHE_DIR}
     *             so that running without arguments behaves as before.
     */
    public static void main(String[] args) {
        // The default path is machine-specific; allow overriding it without editing the source.
        String cacheDir = (args != null && args.length > 0) ? args[0] : DEFAULT_CACHE_DIR;

        OOSpider.create(Site.me().setSleepTime(1000), new ConsolePageModelPipeline(), LianjiaRepo.class)
                .setScheduler(new FileCacheQueueScheduler(cacheDir))
                .addUrl(START_URL)
                .thread(1)
                .run();
    }
}
